Machine Learning Fundamentals

Author

Sarvenaz Mostafazadeh

1 Challenge: Which stock prices behave similarly?

##----libraries----
library(tidyverse)
#> ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
#> ✔ dplyr     1.1.2     ✔ readr     2.1.4
#> ✔ forcats   1.0.0     ✔ stringr   1.5.0
#> ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
#> ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
#> ✔ purrr     1.0.1     
#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#> ✖ dplyr::filter() masks stats::filter()
#> ✖ dplyr::lag()    masks stats::lag()
#> ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidyquant)
#> Loading required package: PerformanceAnalytics
#> Loading required package: xts
#> Loading required package: zoo
#> 
#> Attaching package: 'zoo'
#> 
#> The following objects are masked from 'package:base':
#> 
#>     as.Date, as.Date.numeric
#> 
#> 
#> ######################### Warning from 'xts' package ##########################
#> #                                                                             #
#> # The dplyr lag() function breaks how base R's lag() function is supposed to  #
#> # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
#> # source() into this session won't work correctly.                            #
#> #                                                                             #
#> # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
#> # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
#> # dplyr from breaking base R's lag() function.                                #
#> #                                                                             #
#> # Code in packages is not affected. It's protected by R's namespace mechanism #
#> # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
#> #                                                                             #
#> ###############################################################################
#> 
#> Attaching package: 'xts'
#> 
#> The following objects are masked from 'package:dplyr':
#> 
#>     first, last
#> 
#> 
#> Attaching package: 'PerformanceAnalytics'
#> 
#> The following object is masked from 'package:graphics':
#> 
#>     legend
#> 
#> Loading required package: quantmod
#> Loading required package: TTR
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(broom)
library(umap)
library(dplyr)
library(tidyr)
library(tibble)
library(ggplot2)
library(ggrepel)

##----.read the data----
sp_500_prices_tbl <- readRDS("C:/Users/mosta/Desktop/Business Decisions with Machine Learning/1/sp_500_prices_tbl.rds")
sp_500_prices_tbl
sp_500_index_tbl <- readRDS("C:/Users/mosta/Desktop/Business Decisions with Machine Learning/1/sp_500_index_tbl.rds")
sp_500_index_tbl
##----.Step1 (Convert stock prices to a standardized format (daily returns)----
sp_500_daily_returns_tbl <- sp_500_prices_tbl %>%
  filter(date >= as.Date("2018-01-01")) %>%
  select(symbol, date, adjusted) %>%
  group_by(symbol) %>%
  mutate(lag_adjusted = lag(adjusted),
         pct_return = (adjusted - lag_adjusted) / lag_adjusted) %>%
  filter(!is.na(pct_return)) %>%
  select(symbol, date, pct_return) 
  sp_500_daily_returns_tbl
##----.Step2 (Convert to User-Item Format)----
stock_date_matrix_tbl <- sp_500_daily_returns_tbl %>%
  select(symbol, date, pct_return) %>%
  pivot_wider(names_from = date, values_from = pct_return, values_fill = 0) %>%
  ungroup()
stock_date_matrix_tbl
##----.step3 (Perform K-Means Clustering)----
?kmeans
#> starting httpd help server ... done
kmeans_obj <- stock_date_matrix_tbl %>%
  select(-symbol) %>%
  kmeans(centers = 4, nstart = 20)
#Get the tot.withinss using glance()
glance(kmeans_obj)
##----.step4 (Find the optimal value of K)----
kmeans_mapper <- function(center = 4) {
  stock_date_matrix_tbl %>%
    select(-symbol) %>%
    kmeans(centers = center, nstart = 20)
}

4 %>% kmeans_mapper() %>% glance()
kmeans_mapped_tbl <- tibble(centers = 1:30) %>%
  mutate(k_means = centers %>% map(kmeans_mapper)) %>%
  mutate(glance  = k_means %>% map(glance))
#> Warning: There was 1 warning in `mutate()`.
#> ℹ In argument: `k_means = centers %>% map(kmeans_mapper)`.
#> Caused by warning:
#> ! did not converge in 10 iterations
kmeans_mapped_tbl %>%
  unnest(glance) %>%
  select(centers, tot.withinss)
#Scree Plot
kmeans_mapped_tbl %>%
  unnest(glance) %>%
  select(centers, tot.withinss) %>%
  ggplot(aes(centers, tot.withinss)) +
  geom_point(color = "#2DC6D6", size = 4) +
  geom_line(color = "#2DC6D6", size = 1) +
  ggrepel::geom_label_repel(aes(label = centers), color = "#2DC6D6",max.overlaps = 30) + 
  labs(title = "Scree Plot",
       subtitle = "Measures the distance each of the symbols are from the closes K-Means center",
       caption = "Conclusion: Based on the Scree Plot, We can see that the Scree Plot becomes linear (constant rate of change) between 5 and 10 centers for K.")
#> Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
#> ℹ Please use `linewidth` instead.

##----.step5 (Apply UMAP)----
?umap
umap_results <- stock_date_matrix_tbl %>%
  select(-symbol) %>%
  umap()

umap_results_tbl <- umap_results$layout %>%
  as_tibble(.name_repair = "unique") %>% 
  set_names(c("x", "y")) %>%
  bind_cols(
    stock_date_matrix_tbl %>% select(symbol)
  )
#> New names:
#> • `` -> `...1`
#> • `` -> `...2`
umap_results_tbl %>%
  ggplot(aes(x, y)) +
  geom_point(size=0.5) + 
  geom_label_repel(aes(label = "UMAP Projection"), size = 3)
#> Warning: ggrepel: 494 unlabeled data points (too many overlaps). Consider
#> increasing max.overlaps

#----.step6 (Combine K-Means and UMAP)----
kmeans_obj <- kmeans_mapped_tbl %>%
  pull(k_means) %>%
  pluck(10)

kmeans_clusters_tbl <- kmeans_obj %>% 
  augment(stock_date_matrix_tbl) %>%
  select(symbol, .cluster)
umap_kmeans_results_tbl <- umap_results_tbl %>%
  left_join(kmeans_clusters_tbl, by = "symbol") %>%
  left_join(sp_500_index_tbl %>% select(symbol, company, sector), by = "symbol")

umap_kmeans_results_tbl %>%
  mutate(label_text = str_glue("Customer: {symbol}
                               Cluster: {.cluster}")) %>%
  
  ggplot(aes(x, y, color = .cluster)) +
  geom_point(size=0.5) +
  geom_label_repel(aes(label = label_text), size = 2, fill = "blue", color = "white", max.overlaps = 30) +
  scale_color_manual(values = palette_light() %>% rep(3)) +
  labs(title = "2D Projection",
       subtitle = "UMAP 2D Projection with K-Means Cluster Assignment") +
  theme(legend.position = "none")
#> Warning: ggrepel: 175 unlabeled data points (too many overlaps). Consider
#> increasing max.overlaps